library(tidyverse)
library(tigris)
library(censusapi)
library(sf)
library(mapview)
library(plotly)
library(lehdr)

options(
  tigris_class = "sf",
  tigris_use_cache = T 
)

Loading Blockgroups and Social Distancing Data

The code below loads in the Safegraph Social Distancing dataset.

scc_blockgroups <-
  block_groups("CA","Santa Clara", cb=F, progress_bar=F)

# Below uses tracts sent to us by San Jose
sj_tracts <- st_read("/users/ctenner/Pcloud Drive/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp") %>% 
  st_as_sf() %>% 
  st_transform(4269)
## Reading layer `CSJ_Census_Tracts' from data source `/Users/ctenner/pCloud Drive/SFBI/Data Library/San_Jose/CSJ_Census_Tracts/CSJ_Census_Tracts.shp' using driver `ESRI Shapefile'
## Simple feature collection with 219 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
sj_citycouncil_disticts <- st_read("/Users/ctenner/Desktop/Classes/118/City Council Districts/CITY_COUNCIL_DISTRICTS.shp") %>% 
  st_as_sf() %>% 
  st_transform(4269)
## Reading layer `CITY_COUNCIL_DISTRICTS' from data source `/Users/ctenner/Desktop/Classes/118/City Council Districts/CITY_COUNCIL_DISTRICTS.shp' using driver `ESRI Shapefile'
## Simple feature collection with 10 features and 7 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: 6112856 ymin: 1869687 xmax: 6255982 ymax: 1996555
## epsg (SRID):    2227
## proj4string:    +proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000.0001016 +y_0=500000.0001016001 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=us-ft +no_defs
sj_blockgroups <- 
  scc_blockgroups %>% 
  st_centroid() %>% 
  st_join(sj_tracts, left = F) %>% 
  st_join(sj_citycouncil_disticts%>% dplyr::select(DISTRICTS)) %>% 
  mutate(
    DISTRICTS = DISTRICTS %>% factor(levels = c("1","2","3","4","5","6","7","8","9","10"))
  ) %>% 
  st_set_geometry(NULL) %>% 
  left_join(scc_blockgroups%>% dplyr::select(GEOID), by = "GEOID") %>% 
  st_as_sf() %>% 
  dplyr::select(GEOID, DISTRICTS)

# the spatial join leaves off two blockgroups which are touching district 9. The following code assigns those to district 9
sj_blockgroups$DISTRICTS[is.na(sj_blockgroups$DISTRICTS)] <- 9

mapview(sj_blockgroups, zcol = "DISTRICTS")
sj_socialdistancing <- readRDS("/users/ctenner/Pcloud Drive/SFBI/Restricted Data Library/Safegraph/covid19analysis/sj_socialdistancing.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date()) %>% 
  left_join(sj_blockgroups, by = c("origin_census_block_group"="GEOID")
  ) %>% 
  filter(!is.na(DISTRICTS)) %>% 
  dplyr::select(-geometry)

Median Household Income Analysis

My first analysis looks at the median household income of each block group in San Jose and how it correlates to the percent of residents who have been staying completely at home in the past three days.

Sys.setenv(CENSUS_KEY=" 6c05445ae84c23e1c62fd91756d0f56f51ae94ef")

# below code creates a "lookup" table that will be used later on
acs_vars <-
  listCensusMetadata(
    name = "2018/acs/acs5",
    type = "variables"
  )

# following code loads in census data and processes it to be more readable
sj_median_income_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "B19013_001E",
    key =  "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  rename(
    Median_Income = B19013_001E 
  ) %>% 
  filter(!is.na(Median_Income)) %>% 
  left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
  dplyr::select(-geometry) %>% # comment out this line if geometry is desired 
  filter(
    !is.na(DISTRICTS)
  ) %>% 
  
  # this code joins our census data with the social distancing data, processed as shown below
  left_join(sj_socialdistancing %>%  
                          filter(date > max(date)-3) %>% 
                          group_by(origin_census_block_group) %>% 
                          summarize(
                                    completely_home_device_count = sum(completely_home_device_count),
                                    device_count = sum(device_count)) %>% 
                          mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1)),
            by = c("blockgroup" = "origin_census_block_group")
  ) %>% 
  filter(
    !is.na(device_count)
  ) 


# now we can create a scatter plot of median income v % completely at home
sj_median_income_by_block %>% 
  filter(Median_Income > 0) %>% 
  ggplot(aes(
  x = Median_Income,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Median Household Income",
    y = "% residents completely at home in past 3 days",
    title = "San Jose: Social Distancing and Median Household Income"
  )

Percent Below Federal Poverty Line Analysis

My second analysis looks at the percent of households below the federal poverty line in each block group and how this correlates with the percent of residents completly at home in the past three days.

sj_poverty_by_block <-
  # first we need to load census data. I'm loading in a table of poverty status by household type
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B17017)",
    key =  "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  
  # The following code shows my process of selecting the right variables from the census data
  gather(
    key = code,
    value = number,
    - blockgroup
  ) %>% 
  left_join(acs_vars %>% select_if(names(.) %in% c("name", "label")), 
            by = c("code" = "name")) %>% 

  # at this point I was able to identify the two variables I needed for analysis
  filter(
    code %in% c("B17017_001E", "B17017_002E")
  ) %>% 
  select(-c("label")) %>% 
  spread(code, number) %>% 
  rename(
    Households_Total = B17017_001E,
    Households_Under_Povertyline = B17017_002E
  ) %>% 
  
  # now we can add a column for % of households below poverty line and combine it with the sj_socialdistancing df
  mutate(
    Households_Total = as.numeric(Households_Total),
    Households_Under_Povertyline = as.numeric(Households_Under_Povertyline),
    `% households below poverty line` = Households_Under_Povertyline / Households_Total * 100
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
  ) %>% 
  filter(
    !is.na(device_count)  # this takes out block groups that aren't in SJ
  ) %>% 
  mutate(
    log_of_poverty = log(as.numeric(`% households below poverty line`))
  )

# now we can look at the relationship between % completely at home and % under poverty line on a scatter plot
sj_poverty_by_block %>% ggplot(aes(
  x = `% households below poverty line`,
  y = `% Completely at Home`
)) + 
  geom_point() +
  labs(
    y = "% residents completely at home in past 3 days",
    x = log(as.numeric("% households below poverty line")),
    title = "San Jose: Social Distancing and Poverty Status"
  )

The results are heteroscedastic, so we take the log of the x-variable

sj_poverty_by_block %>% 
  filter(
    log_of_poverty > 0  # taking out blockgroups with no poverty 
  ) %>% 
  ggplot(aes(
  x = log_of_poverty,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    y = "Percent of devices completely at home in past 3 days",
    x = log(as.numeric("Percent of households below poverty line")),
    title = "San Jose: Social Distancing and Poverty Status"
  )

Percent Below 50%, 80% of AMI

Using area median income (AMI) is a more appropriate way to measure low-income status, as the cost of living is substantially higher than the federal average in San Jose. This analysis plots percent of devices entirely at home against percent of households with incomes 50% and 80% percent of Santa Clara County AMI.

sj_ami_by_block <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*", 
    regionin = "state:06+county:085",
    vars = "group(B19001)",
    key =  "6c05445ae84c23e1c62fd91756d0f56f51ae94ef"
  ) %>%
  mutate(
    blockgroup =
      paste0(state,county,tract,block_group)
  ) %>% 
  select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>% 
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
  group_by(blockgroup) %>% 
  summarize(
    Total = B19001_001E,
    `Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
    #sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
    `Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E)
  ) %>% 
  mutate(
    `% under 75,000` = `Under 75,000` / Total * 100,
    `% under 100,000` = `Under 100,000` / Total * 100
  ) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
  ) %>% 
  filter(!is.na(device_count))

cor(sj_ami_by_block$`% under 75,000`, sj_ami_by_block$`% Completely at Home`)
## [1] -0.4182122
sj_ami_by_block %>% ggplot(aes(
  x = `% under 75,000`,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of households with incomes under $75,000 (50% AMI) annually",
    y = "Percent devices completely at home in past 3 days",
    title = "San Jose: Social Distancing and Households Above and Below 50% AMI"
  )

sj_ami_by_block %>% ggplot(aes(
  x = `% under 100,000`,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of households with incomes under $100,000 (80% AMI) annually",
    y = "Percent devices completely at home in past 3 days",
    title = "San Jose: Social Distancing and Households Above and Below 80% AMI"
  )

Using LODES Data

In this part of my analysis I load in a LODES dataset on SJ in order to compare percent of devices at home to a number of economic and employment factors. The

ca_rac <- 
  grab_lodes(
    state = "ca", 
    year = 2017, 
    lodes_type = "rac", 
    job_type = "JT01",
    segment = "S000", 
    state_part = "main",
    agg_geo = "bg"
  ) %>% 
  left_join(sj_blockgroups, by = c("h_bg" = "GEOID")) %>% 
  filter(
    !is.na(DISTRICTS)
  ) %>% 
  mutate(
    percent_healthcare_workers = CNS18 / C000 * 100
  ) %>% 
  dplyr::select(h_bg, DISTRICTS, percent_healthcare_workers) %>% 
  left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income), by = c("h_bg" = "blockgroup")) %>% 
  filter(
    !is.na(device_count)
  )

ca_rac %>% ggplot(aes(
  x = percent_healthcare_workers,
  y = `% Completely at Home`
)) + 
  geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of workers in health care and social assistance",
    y = "Percent devices completely at home in past 3 days",
    title = "San Jose: Social Distancing and Health Care Workers"
  )